In [1]:
library(readr)
count_matrix <- read_csv("/home/jovyan/GSE164073_Eye_count_matrix.csv")  
dim(count_matrix)
head(count_matrix, 3)
Rows: 27946 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): Gene
dbl (18): MW1_cornea_mock_1, MW2_cornea_mock_2, MW3_cornea_mock_3, MW4_corne...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  1. 27946
  2. 19
A tibble: 3 × 19
GeneMW1_cornea_mock_1MW2_cornea_mock_2MW3_cornea_mock_3MW4_cornea_CoV2_1MW5_cornea_CoV2_2MW6_cornea_CoV2_3MW7_limbus_mock_1MW8_limbus_mock_2MW9_limbus_mock_3MW10_limbus_CoV2_1MW11_limbus_CoV2_2MW12_limbus_CoV2_3MW13_sclera_mock_1MW14_sclera_mock_2MW15_sclera_mock_3MW16_sclera_CoV2_1MW17_sclera_CoV2_2MW18_sclera_CoV2_3
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
A1BG 91131 86 77 69 72128 97112 96 95 82 82 70101 91 78 84
A1BG-AS1292284271232250241444311355274298322233241243279203278
A1CF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Preprocessing¶

In [2]:
count_matrix <- as.data.frame(count_matrix)  # convert tibble to data.frame

# Set gene symbols as row names
rownames(count_matrix) <- count_matrix$Gene

# Remove the Gene column so only numeric counts remain
count_matrix <- count_matrix[, -which(names(count_matrix) == "Gene")]

# Ensure counts are integers
count_matrix[] <- lapply(count_matrix, function(x) as.integer(round(x)))

# check data
head(count_matrix, 3)
A data.frame: 3 × 18
MW1_cornea_mock_1MW2_cornea_mock_2MW3_cornea_mock_3MW4_cornea_CoV2_1MW5_cornea_CoV2_2MW6_cornea_CoV2_3MW7_limbus_mock_1MW8_limbus_mock_2MW9_limbus_mock_3MW10_limbus_CoV2_1MW11_limbus_CoV2_2MW12_limbus_CoV2_3MW13_sclera_mock_1MW14_sclera_mock_2MW15_sclera_mock_3MW16_sclera_CoV2_1MW17_sclera_CoV2_2MW18_sclera_CoV2_3
<int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
A1BG 91131 86 77 69 72128 97112 96 95 82 82 70101 91 78 84
A1BG-AS1292284271232250241444311355274298322233241243279203278
A1CF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Create metadata¶

In [3]:
sample_names <- colnames(count_matrix)

# Suppose the 19 samples are in order:
# 6 cornea (3 mock + 3 CoV2), 6 limbus (3 mock + 3 CoV2), 6 sclera (3 mock + 3 CoV2), + 1 extra?
# Adjust as needed to match your dataset
tissue <- c(rep("cornea",6), rep("limbus",6), rep("sclera",6))
condition <- c(rep(c("mock","CoV2"), each=3), rep(c("mock","CoV2"), each=3), rep(c("mock","CoV2"), each=3))

# Build colData
colData <- data.frame(tissue = tissue, condition = condition)
rownames(colData) <- sample_names

# Check
nrow(colData)   # should be 19
ncol(count_matrix)  # should be 19
colData
18
18
A data.frame: 18 × 2
tissuecondition
<chr><chr>
MW1_cornea_mock_1corneamock
MW2_cornea_mock_2corneamock
MW3_cornea_mock_3corneamock
MW4_cornea_CoV2_1corneaCoV2
MW5_cornea_CoV2_2corneaCoV2
MW6_cornea_CoV2_3corneaCoV2
MW7_limbus_mock_1limbusmock
MW8_limbus_mock_2limbusmock
MW9_limbus_mock_3limbusmock
MW10_limbus_CoV2_1limbusCoV2
MW11_limbus_CoV2_2limbusCoV2
MW12_limbus_CoV2_3limbusCoV2
MW13_sclera_mock_1scleramock
MW14_sclera_mock_2scleramock
MW15_sclera_mock_3scleramock
MW16_sclera_CoV2_1scleraCoV2
MW17_sclera_CoV2_2scleraCoV2
MW18_sclera_CoV2_3scleraCoV2
In [4]:
## Check if all column names in count_matrix are in colData or metadata rownames

all(colnames(count_matrix) %in% rownames(colData))

# Check if all colData rownames are in count_matrix columns

all(rownames(colData) %in% colnames(count_matrix))

#Check the order of samples

#DESeq2 requires row order in colData to match column order in count_matrix:

all(rownames(colData) == colnames(count_matrix))
TRUE
TRUE
TRUE

Differential expression analysis using DeSeq2¶

In [49]:
#install.packages("BiocManager")
#BiocManager::install(version = "3.18")  

#BiocManager::install("DESeq2")
In [55]:
library(DESeq2)

dds <- DESeqDataSetFromMatrix(
  countData =  round(count_matrix),
  colData = colData,
  design = ~ condition   
)
converting counts to integer mode

Warning message in DESeqDataSet(se, design = design, ignoreRank):
“some variables in design formula are characters, converting to factors”
In [56]:
##  Filtering before DeSeq2 creating

dds <- dds[rowSums(counts(dds)) >= 10, ]  # keep genes with ≥10 counts across all samples
In [57]:
## Run DESeq2
dds <- DESeq(dds)
res <- results(dds)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

In [58]:
res
log2 fold change (MLE): condition mock vs CoV2 
Wald test p-value: condition mock vs CoV2 
DataFrame with 17925 rows and 6 columns
           baseMean log2FoldChange     lfcSE      stat    pvalue      padj
          <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
A1BG      90.387769      0.1449492  0.117706  1.231452 0.2181537  0.775805
A1BG-AS1 278.108499      0.0490181  0.112583  0.435394 0.6632767  0.952794
A2M      872.917260      0.8098853  0.741530  1.092182 0.2747531  0.812063
A2M-AS1   11.330923     -0.4885154  0.252834 -1.932160 0.0533398        NA
A2ML1      0.578192      1.1192600  1.825458  0.613139 0.5397843        NA
...             ...            ...       ...       ...       ...       ...
ZYG11A       17.652      0.3833286 0.3738318  1.025404 0.3051725  0.835643
ZYG11B     1384.163     -0.0983171 0.1080570 -0.909863 0.3628946  0.868647
ZYX        6717.754      0.2508181 0.1112974  2.253586 0.0242222  0.352814
ZZEF1      1362.867     -0.1285807 0.0928343 -1.385056 0.1660353  0.723257
ZZZ3       1186.311     -0.1190627 0.0749112 -1.589385 0.1119735  0.646802
In [59]:
summary(res)
out of 17925 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 125, 0.7%
LFC < 0 (down)     : 183, 1%
outliers [1]       : 0, 0%
low counts [2]     : 4518, 25%
(mean count < 14)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Filtering¶

In [11]:
# Filter out genes with significant p-values (padj < 0.05)
significant_genes <- res[!is.na(res$padj) & res$padj < 0.05, ]

# Identify upregulated genes (log2FoldChange > 0)
upregulated_genes <- significant_genes[significant_genes$log2FoldChange > 0, ]

# Identify downregulated genes (log2FoldChange < 0)
downregulated_genes <- significant_genes[significant_genes$log2FoldChange < 0, ]

# Get the total number of upregulated and downregulated genes
num_upregulated <- nrow(upregulated_genes)
num_downregulated <- nrow(downregulated_genes)

# Print the results
cat("Number of upregulated genes:", num_upregulated, "\n")
cat("Number of downregulated genes:", num_downregulated, "\n")
Number of upregulated genes: 83 
Number of downregulated genes: 146 

MA plot¶

In [12]:
plotMA(res, ylim = c(-5,5))

#In DESeq2, the MA plot is often used to visualize the effect of shrinkage estimators and to identify genes with significant differential expression, which are typically highlighted in color.
No description has been provided for this image

PCA plot¶

In [13]:
# Perform variance stabilizing transformation (VST) on DESeqDataSet (dds)
vsd <- vst(dds, blind = FALSE)

# Plot PCA without the table, remove grid lines, add axis lines, and include ticks
library(ggplot2)

# Generate the PCA plot without the data table
plotPCA(vsd, intgroup = "tissue", returnData = FALSE) + 
  theme_minimal() +  # Minimal theme for a clean look
  theme(
    axis.text = element_text(size = 12),  # Larger axis text
    axis.title = element_text(size = 14), # Larger axis titles
    legend.title = element_text(size = 12),  # Adjust legend title size
    legend.text = element_text(size = 10),   # Adjust legend text size
    panel.grid = element_blank(),  # Remove grid lines
    axis.line = element_line(size = 1, color = "black"),  # Add axis lines
    axis.ticks = element_line(size = 1, color = "black"),  # Add axis ticks
    axis.ticks.length = unit(0.07, "inches")
  ) +
  labs(
    title = "Principal Component Analysis (PCA) of Gene Expression",
    x = "Principal Component 1 (PC1)",  # Label for x-axis
    y = "Principal Component 2 (PC2)",  # Label for y-axis
    color = "Tissue Type"
  ) +
  theme(legend.position = "right")  # Position legend at the top
using ntop=500 top features by variance

Warning message:
“The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.”
No description has been provided for this image

Plotting VolcanoPlot¶

In [14]:
library(dplyr)
library(ggplot2)
library(ggrepel)

# Convert results to a data frame
res_df <- as.data.frame(res)

# Clean up results
res_df_clean <- res_df[!is.na(res_df$padj), ]

# Clean DESeq2 results
res_df_clean <- res_df[!is.na(res_df$padj), ]

# Add regulation status and convert to factor
res_df_clean <- res_df_clean %>%
  mutate(
    Regulation = case_when(
      log2FoldChange > 0.58 & padj < 0.05  ~ "Up",
      log2FoldChange < -0.58 & padj < 0.05 ~ "Down",
      TRUE ~ "Not Significant"  # Keep "Not Significant" for the rest
    ),
    Regulation = factor(Regulation, levels = c("Down", "Not Significant", "Up"))  # Define the factor levels
  )

# Filter points above -log10(padj) > 50
res_df_clean <- res_df_clean %>%
  filter(-log10(padj) <= 50)

# Select top 5 Up and Down genes
top_up <- res_df_clean %>% filter(Regulation == "Up") %>% arrange(padj) %>% slice_head(n = 5)
top_down <- res_df_clean %>% filter(Regulation == "Down") %>% arrange(padj) %>% slice_head(n = 5)
top_genes <- bind_rows(top_up, top_down)

# Volcano plot with enhanced aesthetics
volcano = ggplot(res_df_clean, aes(x = log2FoldChange, y = -log10(padj), color = Regulation)) +
  geom_point(alpha = 0.65, size = 1.5) +  # Keep only one geom_point() layer
  scale_color_manual(
    values = c("Down" = "blue", "Not Significant" = "grey", "Up" = "red"),  # Color for all 3 categories
    breaks = c("Down", "Not Significant", "Up")  # Ensure all categories are in the legend
  ) +
  geom_vline(xintercept = c(-0.58, 0.58), linetype = "dashed", color = "black") +  # threshold lines
  geom_hline(yintercept = 1.2, linetype = "dashed", color = "black") +         # significance threshold
  geom_text_repel(data = top_genes, aes(label = rownames(top_genes)), size = 3, max.overlaps = 50, show.legend = FALSE) +  # Disable legend for text labels
  theme_minimal() +
  theme(
    axis.line.y.left = element_line(color = "black", size = 0.8),  # Solid left y-axis
    axis.line.x = element_line(color = "black", linewidth = 0.8),       # Solid x-axis
    axis.line.y.right = element_blank(),                            # Remove right y-axis
    legend.position = c(0.8, 0.8),                                  # Custom position for the legend (X = 0.8, Y = 0.8)
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    legend.key.height = unit(1.5, "lines"), 
    legend.key.width = unit(2, "lines"),
    legend.spacing.y = unit(0.5, "cm"),  # Increase space between items in the legend
    legend.box.spacing = unit(0.5, "cm"),  # Adjust space around the legend box
    axis.ticks.length = unit(0.1, "cm"),                             # Increase the tick length
    axis.ticks = element_line(color = "black", size = 0.8),         # Black color and size for ticks
    axis.ticks.length.x = unit(0.15, "cm"),
    axis.ticks.length.y = unit(0.15, "cm"),
    panel.grid.major = element_blank(),   # Remove major grid lines
    panel.grid.minor = element_blank()    # Remove minor grid lines
  ) +
  labs(
    x = "log2 FC",
    y = "-log10 Adjusted p-value"
  )

# Display the plot
volcano
Attaching package: ‘dplyr’


The following object is masked from ‘package:Biobase’:

    combine


The following object is masked from ‘package:matrixStats’:

    count


The following objects are masked from ‘package:GenomicRanges’:

    intersect, setdiff, union


The following object is masked from ‘package:GenomeInfoDb’:

    intersect


The following objects are masked from ‘package:IRanges’:

    collapse, desc, intersect, setdiff, slice, union


The following objects are masked from ‘package:S4Vectors’:

    first, intersect, rename, setdiff, setequal, union


The following objects are masked from ‘package:BiocGenerics’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Warning message:
“A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.”
No description has been provided for this image

Heatmap¶

In [15]:
install.packages("pheatmap")
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)

In [16]:
# Normalize counts using DESeq2
log_norm_counts <- log2(counts(dds, normalized = TRUE) + 1)

# Select top 50 variable genes
library(matrixStats)
top_var_genes <- head(order(rowVars(log_norm_counts), decreasing = TRUE), 50)
mat <- log_norm_counts[top_var_genes, ]

# Publication-ready color palette
pub_palette <- colorRampPalette(c("#5D3A9B", "white", "#D7191C"))(100) 

# Draw heatmap
library(pheatmap)
heatmap <- pheatmap(
  mat,
  scale = "row",
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean",
  clustering_method = "complete",
  color = pub_palette,
  show_rownames = TRUE,
  show_colnames = TRUE,
  fontsize_row = 8,
  fontsize_col = 10,
  border_color = NA,
  main = "Top 50 Variable Genes"
)

# Save plot
ggsave("heatmap.png", plot = heatmap, width = 10, height = 12)
heatmap
No description has been provided for this image

GO Enrichment Analysis¶

In [17]:
# Filter the results to get significant DEGs (adjusted p-value < 0.05)
deg <- res[which(res$padj < 0.05), ]
deg_genes <- rownames(deg)
In [18]:
install.packages("devtools")
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)

In [19]:
devtools::install_github("ctlab/fgsea")
devtools::install_github("GuangchuangYu/DOSE")
devtools::install_github("YuLab-SMU/enrichplot")
devtools::install_github("YuLab-SMU/clusterProfiler")
Skipping install of 'fgsea' from a github remote, the SHA1 (d6f1670c) has not changed since last install.
  Use `force = TRUE` to force installation

Downloading GitHub repo GuangchuangYu/DOSE@HEAD


Skipping 15 packages ahead of CRAN: zlibbioc, GenomeInfoDbData, GenomeInfoDb, XVector, Biostrings, KEGGREST, S4Vectors, IRanges, Biobase, BiocGenerics, AnnotationDbi, GO.db, BiocParallel, qvalue, GOSemSim

── R CMD build ─────────────────────────────────────────────────────────────────
* checking for file ‘/tmp/RtmpXTSN2B/remotesd66057730c/YuLab-SMU-DOSE-34a6365/DESCRIPTION’ ... OK
* preparing ‘DOSE’:
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
* looking to see if a ‘data/datalist’ file should be added
* building ‘DOSE_4.3.0.tar.gz’

Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)

Warning message in i.p(...):
“installation of package ‘/tmp/RtmpXTSN2B/filed6a08fb2f/DOSE_4.3.0.tar.gz’ had non-zero exit status”
Downloading GitHub repo YuLab-SMU/enrichplot@HEAD


Skipping 17 packages ahead of CRAN: zlibbioc, GenomeInfoDbData, GenomeInfoDb, XVector, Biostrings, KEGGREST, S4Vectors, IRanges, Biobase, BiocGenerics, AnnotationDbi, GO.db, BiocParallel, treeio, qvalue, GOSemSim, ggtree

── R CMD build ─────────────────────────────────────────────────────────────────
* checking for file ‘/tmp/RtmpXTSN2B/remotesd63488ce6/YuLab-SMU-enrichplot-bf86dcd/DESCRIPTION’ ... OK
* preparing ‘enrichplot’:
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
* building ‘enrichplot_1.29.2.001.tar.gz’

Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)

Warning message in i.p(...):
“installation of package ‘/tmp/RtmpXTSN2B/filed69eb5168/enrichplot_1.29.2.001.tar.gz’ had non-zero exit status”
Skipping install of 'clusterProfiler' from a github remote, the SHA1 (39395ce1) has not changed since last install.
  Use `force = TRUE` to force installation

In [20]:
# Install Bioconductor package manager if it's not already installed
install.packages("BiocManager")

# Install org.Hs.eg.db
BiocManager::install("org.Hs.eg.db")
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org

Bioconductor version 3.18 (BiocManager 1.30.26), R 4.3.3 (2024-02-29)

Warning message:
“package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'org.Hs.eg.db'”
Old packages: 'DOSE', 'enrichplot', 'AER', 'av', 'boot', 'broom', 'checkmate',
  'collections', 'commonmark', 'cowplot', 'crosstalk', 'curl', 'data.table',
  'Deriv', 'dials', 'doBy', 'doFuture', 'evaluate', 'future', 'future.apply',
  'gert', 'GGally', 'gganimate', 'gghighlight', 'ggpubr', 'ggridges',
  'ggstats', 'glmnet', 'hardhat', 'haven', 'httpuv', 'httr2', 'infer', 'later',
  'mlr', 'modeldata', 'ndjson', 'parallelly', 'patchwork', 'pbkrtest',
  'pillar', 'plotly', 'pROC', 'promises', 'purrr', 'qpdf', 'quanteda', 'Rcpp',
  'RcppArmadillo', 'readtext', 'reticulate', 'RODBC', 'rprojroot', 'rsample',
  'rvest', 's2', 'sf', 'shiny', 'tibble', 'tidytext', 'units', 'usethis',
  'utf8', 'V8', 'waldo', 'workflows', 'xfun', 'XML', 'xml2'

In [41]:
# Ensure plots render inline
options(jupyter.plot_mimetypes = c("image/png"))
In [48]:
library(clusterProfiler)
library(org.Hs.eg.db)  # Load for human, adjust based on your organism

# Perform GO enrichment analysis (Biological Process - BP, Molecular Function - MF, Cellular Component - CC)
ego <- enrichGO(
  gene = deg_genes,             # DEGs (gene symbols)
  OrgDb = org.Hs.eg.db,         # Organism database
  keyType = "SYMBOL",           # Gene identifier type (e.g., SYMBOL, ENTREZID)
  ont = "BP",                   # Biological Process ontology (can be changed to MF or CC)
  pvalueCutoff = 0.05,          # Adjusted p-value threshold
  qvalueCutoff = 0.2            # FDR threshold (optional)
)

# View the results
#head(ego)
barplot(ego, color = "p.adjust", showCategory = 10)
No description has been provided for this image
In [ ]: